Spam Analysis

This notebook will contain the codes and documentations for spam analysis. A report is generated on the analysis in the dropbox paper https://paper.dropbox.com/doc/Spam-Analysis-of-Maya-questions-NubDXwEKR6NDOBghYGgQ4.

First we will connect to our database.


In [1]:
from database import Database

database = Database(
    '<host name>',
    '<database name>',
    '<user name>',
    '<password>',
    'utf8mb4'
)

Spam by Source and User type (Registered & non-registered)


In [ ]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')
import pandas as pd

if connection:
    try:
        with connection.cursor() as cursor:
            query = "select count(*) as count, source from questions where status='spam' GROUP BY source,user_id is not null"
            cursor.execute(query)
            data = cursor.fetchall()
            df2 = pd.DataFrame(data)
            df2.plot.bar()
            plt.show()
            print data
    finally:
        connection.close()

Create Spam Type and Spam By Repeat Tables


In [4]:
def create_spam_type_table(cursor, connection):
    create_schema_sql = "CREATE TABLE spam_type(id int(11) unsigned NOT NULL AUTO_INCREMENT,question_id int(10) unsigned NOT NULL,type enum('repeat','abusive','random','greeting','irrelevant','test') DEFAULT NULL,PRIMARY KEY (id),KEY question_id (question_id),CONSTRAINT spam_type_ibfk_1 FOREIGN KEY (question_id) REFERENCES questions (id)) ENGINE=InnoDB DEFAULT CHARSET=utf8"
    cursor.execute(create_schema_sql)
    connection.commit()

def create_spam_repeat_table(cursor, connection):
    create_schema_sql = "CREATE TABLE spam_by_repeat(id int(11) unsigned NOT NULL AUTO_INCREMENT,question_id int(10) unsigned NOT NULL,parent_id int(10) unsigned NOT NULL,is_same_user tinyint(1) NOT NULL,time_dif int(11) unsigned NOT NULL,PRIMARY KEY (id),KEY question_id (question_id),KEY parent_id (parent_id),CONSTRAINT spam_by_repeat_ibfk_1 FOREIGN KEY (question_id) REFERENCES questions (id),CONSTRAINT spam_by_repeat_ibfk_2 FOREIGN KEY (parent_id) REFERENCES questions (id)) ENGINE=InnoDB DEFAULT CHARSET=utf8"
    cursor.execute(create_schema_sql)
    connection.commit()

Repeated Questions Count

Check for Greetings


In [ ]:
from pyxdameraulevenshtein import damerau_levenshtein_distance as dl_distance

def check_for_greeting(sentence):
    greeting_word = ['hi', 'hey', 'hello', 'bye', 'thank', 'কেমন'.decode('utf-8')]
    if len(sentence.split(' ')) < 10:
        # greetings
        for words in sentence.split(' '):
            for i in greeting_word:
                if dl_distance(i, words) <= 1:
                    sql = "INSERT INTO spam_type(question_id, type) VALUES('" + str(record['id']) + "','greeting')"
                    cursor.execute(sql)
                    connection.commit()
                    return True
    return False

Check for Testing Questions


In [ ]:
def check_for_test(sentence):
    testing_word = ['test', 'check', 'testing', 'checking']
    if len(sentence.split(' ')) < 10:
        # greetings
        for words in sentence.split(' '):
            for i in testing_word:
                if dl_distance(i, words) <= 1 or i in sentence:
                    sql = "INSERT INTO spam_type(question_id, type) VALUES('" + str(record['id']) + "','test')"
                    cursor.execute(sql)
                    connection.commit()
                    return True

    return False

Check for Random Characters


In [ ]:
def mark_as_random(record):
    sql = "INSERT INTO spam_type(question_id, type) VALUES('" + str(record['id']) + "','random')"
    cursor.execute(sql)
    connection.commit()

Check for Irrelevant Questions


In [ ]:
def check_for_irrelevant(sentence):
    irrelevant_word = ['voice']
    if len(sentence.split(' ')) < 10:
        # greetings
        for words in sentence.split(' '):
            for i in irrelevant_word:
                if dl_distance(i, words) <= 1:
                    sql = "INSERT INTO spam_type(question_id, type) VALUES('" + str(record['id']) + "','irrelevant')"
                    cursor.execute(sql)
                    connection.commit()
                    return True
    return False

Check for Abusive Questions


In [ ]:
def check_for_abusive(sentence):
    abusive_word = ['sex','সেক্স'.decode('utf-8'),'যৌন'.decode('utf-8'),'দুধ'.decode('utf-8'),'চুদ'.decode('utf-8'), 'লিঙ্গ'.decode('utf-8')]
    # greetings
    for words in sentence.split(' '):
        for i in abusive_word:
            if dl_distance(i, words) <= 1 or i in sentence:
                sql = "INSERT INTO spam_type(question_id, type) VALUES('" + str(record['id']) + "','abusive')"
                cursor.execute(sql)
                connection.commit()
                return True
    return False

Check for Repeated Questions


In [ ]:
def check_for_repeat(record):
    sql = "SELECT id, email, created_at FROM questions WHERE id < " + str(record['id']) + " and body='" + record['body'] + "'"
    cursor.execute(sql)
    result = cursor.fetchall()
    if result:
        match = 0
        for i in result:
            if i['id'] > match:
                match = i['id']
                data = i

        sql = "INSERT INTO spam_type(question_id, type) VALUES('" + str(record['id']) + "','repeat')"
        cursor.execute(sql)
        connection.commit()
        if (data['email'] == record['email']):
            sql = "INSERT INTO spam_by_repeat(question_id, parent_id, is_same_user, time_dif) VALUES('" + str(record['id']) + "','" + str(data['id']) + "','1','" + str(abs((record['created_at'] - data['created_at']).total_seconds())) + "')"
        else:
            sql = "INSERT INTO spam_by_repeat(question_id, parent_id, is_same_user, time_dif) VALUES('" + str(record['id']) + "','" + str(data['id']) + "','0','" + str(abs((record['created_at'] - data['created_at']).total_seconds())) + "')"
        cursor.execute(sql)
        connection.commit()
        return True
    return False

Spam Analysis by Type


In [5]:
connection = database.connect_with_pymysql()
if connection:
    try:
        with connection.cursor() as cursor:
            create_spam_type_table(cursor, connection)
            create_spam_repeat_table(cursor, connection)
            sql = "SELECT id, body, email, source, created_at FROM questions WHERE status='spam'"
            cursor.execute(sql)
            data = cursor.fetchall()
            for record in data:
                if record['body']:
                    # no vowel
                    if check_for_greeting(record['body']):
                        continue
                    elif check_for_test(record['body']):
                        continue
                    elif check_for_irrelevant(record['body']):
                        continue
                    elif len(record['body'].split(' ')) <= 3:
                        mark_as_random(record)
                        continue
                    elif check_for_repeat(record):
                        continue
                    elif check_for_abusive(record['body']):
                        continue
                    sql = "INSERT INTO spam_type(question_id) VALUES('" + str(record['id']) + "')"
                    cursor.execute(sql)
                    connection.commit()
                else:
                    # random meaningless characters or blank message
                    mark_as_random(record)
    finally:
        connection.close()

Repeated questions by Time and User


In [ ]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
timeline = []
connection = database.connect_with_pymysql()
if connection:
    try:
        with connection.cursor() as cursor:
            sql = "SELECT time_dif FROM spam_by_repeat where is_same_user=0"
            cursor.execute(sql)
            result = cursor.fetchall()
    finally:
        connection.close()
for i in result:
    timeline.append(i['time_dif']/60)
plt.style.use('ggplot')
ranges = [0, 2, 60, 1440, 10080, sys.maxint]
col = ['<2min', '2-60min', '1-24hr', '1-7day', '>1week']

val = np.zeros(5)

for i in range(len(ranges)-1):
    for j in timeline:
        if ranges[i] <= j < ranges[i+1]:
            val[i] += 1
df2 = pd.DataFrame(np.array(val), col, columns=['Count of repeated questions by time by different user'])
ax = df2.plot.bar()
for p in ax.patches:
    b=p.get_bbox()
    ax.annotate("{}".format(int(b.y1 + b.y0)), ((b.x0 + b.x1)/2 - 0.1, b.y1))
plt.show()